# PSRM Dictionary Creation Nov 2025
# Show the working directory: the CSV inputs further down are read with
# relative paths, so the script must be run from the data folder.
getwd()

library(dplyr)
library(quanteda)
library(glmnet)
library(tidyverse)

# The workspace is deliberately NOT cleared with rm(list = ls()): that call
# deletes the caller's objects without resetting packages or options, so it
# does not give a clean session anyway. Restart R for a clean run instead.

# Log the session after the packages are attached so that their versions are
# part of the record.
sessionInfo()
# Originally run under:
# R version 4.3.3 (2024-02-29 ucrt)
# Platform: x86_64-w64-mingw32/x64 (64-bit)
# Running under: Windows 11 x64 (build 26200)

# Coded Articles for Lasso model ----------------------------------------------------

# Hand-coded articles used to train and evaluate the lasso classifier.
# Data not publicly available due to copyright restrictions.
# File at https://drive.google.com/file/d/1KHr-v86hDbVe7qTvlVda33gFV-u4YOH9/view?usp=sharing for original replication purposes.
pd_class <- read.csv("pd_5000_final_nov_2024.csv")

# Set quanteda's thread count before the first tokens() call so that it also
# applies to the training corpus, not only to the later full-corpus runs.
quanteda_options(threads = parallel::detectCores())

# Drop Latin letters and digits from the text before building the
# document-feature matrix.
pd_class <- pd_class %>%
  mutate(token = gsub("[A-Za-z0-9]", "", token))
pd_dfm <- dfm(tokens(pd_class$token, what = "fasterword"))
# Keep only features that occur at least 5 times in the coded sample.
pd_dfm_t <- dfm_trim(pd_dfm, min_termfreq = 5)

# 80/20 train/test split; the seed fixes the split.
set.seed(500)
training <- sample(seq_len(nrow(pd_class)), floor(.80 * nrow(pd_class)))
test <- setdiff(seq_len(nrow(pd_class)), training)

# Lasso (alpha = 1) logistic regression, 5-fold CV on misclassification error.
# NOTE(review): parallel = TRUE only runs folds in parallel when a foreach
# backend has been registered beforehand; this script registers none, so
# confirm one exists in the session or set parallel = FALSE.
lasso <- cv.glmnet(
  x = pd_dfm_t[training, ],
  y = as.numeric(pd_class$final[training]),
  family = "binomial", alpha = 1, nfolds = 5, parallel = TRUE,
  intercept = TRUE, type.measure = "class"
)

plot(lasso)

# Report accuracy, precision and recall for a binary classifier.
#
# Args:
#   ypred: predicted labels; a vector or a one-column matrix such as the
#          result of predict(..., type = "class").
#   y:     true labels, same length as `ypred`.
#
# Labels are compared as character strings. The two distinct labels are
# ordered with sort() and the second one is the positive class ("1" for 0/1
# coding). Both vectors are tabulated over the same two levels, so the
# confusion matrix is always 2 x 2 even when one class is never predicted;
# an undefined ratio is then reported as NaN instead of raising an error.
#
# Returns (invisibly) a named numeric vector with elements `accuracy`,
# `precision` and `recall`; the rounded values are printed with message().
performance <- function(ypred, y) {
  ypred <- as.character(ypred)
  y <- as.character(y)
  if (length(ypred) != length(y)) {
    stop("`ypred` and `y` must have the same length.", call. = FALSE)
  }

  classes <- sort(unique(c(ypred, y)))
  if (length(classes) != 2) {
    stop("Expected exactly two classes across `ypred` and `y`, found ",
         length(classes), ".", call. = FALSE)
  }

  # Rows = predicted, columns = truth; shared levels keep the cells aligned.
  tab <- table(factor(ypred, levels = classes), factor(y, levels = classes))

  accuracy <- sum(diag(tab)) / sum(tab)
  precision <- tab[2, 2] / sum(tab[2, ])
  recall <- tab[2, 2] / sum(tab[, 2])

  message("Accuracy = ", round(accuracy, 2), "\n",
          "Precision = ", round(precision, 2), "\n",
          "Recall = ", round(recall, 2))

  invisible(c(accuracy = accuracy, precision = precision, recall = recall))
}

# computing predicted values for the held-out 20% of the coded articles
# (no `s` is given, so predict() uses cv.glmnet's default lambda)
preds <- predict(lasso, newx = pd_dfm_t[test, ], type = "class")

# confusion matrix (rows = predicted, columns = hand-coded label)
table(preds, pd_class$final[test])
performance(preds, pd_class$final[test])

# Most predictive features
# Position of lambda.1se on the fitted lambda path; lambda.1se is taken from
# that same path, so the exact comparison is safe here.
best.lambda <- which(lasso$lambda == lasso$lambda.1se)
# Coefficients (without intercept) at that lambda, one per feature.
beta <- lasso$glmnet.fit$beta[, best.lambda]
df <- data.frame(coef = as.numeric(beta),
                 word = names(beta), stringsAsFactors = FALSE)

#df <- df[order(df$coef, decreasing=TRUE),] # Inspecting the most predicted words
#head(df[,c("coef", "word")], n=30) # Inspecting the most predicted words

# Applying Lasso Classifier to Full People's Daily Data --------------------------------------------

# Data of the People's Daily 
# Data not publicly available due to copyright restrictions.
# File at https://drive.google.com/file/d/14uTzFegd-WIeebfcZN91DC3p3Ftmok6q/view?usp=drive_link for original replication purposes.

pd <- read.csv("pd_since_1949.csv")
# Removing no-content rows. The is.na() guard matters: `NA != ""` is NA, and
# indexing a data frame with NA produces all-NA phantom rows. Base indexing
# is used on purpose so the original row names are kept.
pd_all <- pd[!is.na(pd$token) & pd$token != "", ]
pd <- NULL # release the unfiltered copy
######### Dividing the data by leadership
# Periods are contiguous calendar-year ranges; rows whose date is missing or
# outside 1949-2023 fall into none of them (subset() drops NA conditions).
pd_all$date <- as.Date(pd_all$date)
pd_mao <- subset(pd_all, date >= as.Date("1949-01-01") & date <= as.Date("1976-12-31"))
pd_deng <- subset(pd_all, date >= as.Date("1977-01-01") & date <= as.Date("1992-12-31"))
pd_jiang <- subset(pd_all, date >= as.Date("1993-01-01") & date <= as.Date("2002-12-31"))
pd_hu <- subset(pd_all, date >= as.Date("2003-01-01") & date <= as.Date("2012-12-31"))
pd_xi <- subset(pd_all, date >= as.Date("2013-01-01") & date <= as.Date("2023-12-31"))
pd_all <- NULL # release the combined copy once the splits exist

# Articles per leadership period
nrow(pd_mao)
nrow(pd_deng)
nrow(pd_jiang)
nrow(pd_hu)
nrow(pd_xi)


##########  Applying Classifier to Different Leaderships 

# Label one leadership period's articles with the fitted lasso classifier.
#
# Args:
#   articles: data frame with a `token` text column.
#   model:    fitted cv.glmnet object.
#   vocab:    feature names of the training dfm; the new dfm is aligned to
#             them so its columns match what the model was fitted on.
#
# Returns `articles` with Latin letters and digits removed from `token` and a
# numeric `predictions` column added (one value per row).
#
# The document-feature matrices built here are local, so they are released
# when the function returns instead of staying in the global environment.
classify_articles <- function(articles, model, vocab) {
  articles <- articles %>%
    mutate(token = gsub("[A-Za-z0-9]", "", token))

  doc_terms <- dfm(tokens(articles$token, what = "fasterword"))
  doc_terms <- dfm_match(doc_terms, features = vocab)

  # predict() returns a one-column matrix of class labels; as.numeric()
  # flattens it to a plain vector with one entry per article.
  predicted <- predict(model, newx = doc_terms, type = "class")
  articles$predictions <- as.numeric(predicted)
  articles
}

# Xi ----------------------------------------------------------------------

pd_xi <- classify_articles(pd_xi, lasso, featnames(pd_dfm_t))
pd_xi_foreign <- pd_xi[pd_xi$predictions == 1, ]

write.csv(pd_xi_foreign, "pd_xi_foreign_nov_2024.csv") # For replication

# Hu ----------------------------------------------------------------------

pd_hu <- classify_articles(pd_hu, lasso, featnames(pd_dfm_t))
pd_hu_foreign <- pd_hu[pd_hu$predictions == 1, ]

write.csv(pd_hu_foreign, "pd_hu_foreign_nov_2024.csv") # For replication 

# Mao ---------------------------------------------------------------------

pd_mao <- classify_articles(pd_mao, lasso, featnames(pd_dfm_t))
pd_mao_foreign <- pd_mao[pd_mao$predictions == 1, ]

write.csv(pd_mao_foreign, "pd_mao_foreign_nov_2024.csv") # For replication 

# Deng ---------------------------------------------------------------------

pd_deng <- classify_articles(pd_deng, lasso, featnames(pd_dfm_t))
pd_deng_foreign <- pd_deng[pd_deng$predictions == 1, ]

write.csv(pd_deng_foreign, "pd_deng_foreign_nov_2024.csv") # For replication 

# Jiang ---------------------------------------------------------------------

pd_jiang <- classify_articles(pd_jiang, lasso, featnames(pd_dfm_t))
pd_jiang_foreign <- pd_jiang[pd_jiang$predictions == 1, ]

write.csv(pd_jiang_foreign, "pd_jiang_foreign_nov_2024.csv") # For replication 

# Appendix 1 Table 1 Column 1 (p.1) --------------------------------------------------
# Number of articles in each leadership period (rows of the per-leader data
# frames).

nrow(pd_mao)
nrow(pd_deng)
nrow(pd_jiang)
nrow(pd_hu)
nrow(pd_xi)

# Appendix 1 Table 1 Column 2 (p. 1) ------------------------------------
#67430+ 38178+ 113531 + 36729 + 40453 = Total Observations
# NOTE(review): the figures above are presumably the counts recorded from the
# original run for this column - confirm against the output below.
# Number of articles per period with predictions == 1 (rows of the
# *_foreign data frames).

nrow(pd_mao_foreign)
nrow(pd_deng_foreign)
nrow(pd_jiang_foreign)
nrow(pd_hu_foreign)
nrow(pd_xi_foreign)


# Convert the tokenized objects by leaders into text files for word2vec model --------

# One text file per leader, written in the order listed; every element of the
# `token` column becomes one line of the file.
# NOTE(review): the Mao file is called "mao_token_..." (singular) unlike the
# other four; the name is kept unchanged - confirm what the word2vec step reads.
invisible(Map(
  writeLines,
  list(
    pd_xi_foreign$token,
    pd_hu_foreign$token,
    pd_jiang_foreign$token,
    pd_mao_foreign$token,
    pd_deng_foreign$token
  ),
  c(
    "xi_tokens_nov_2024.txt", # for replication
    "hu_tokens_nov_2024.txt", # for replication
    "jiang_tokens_nov_2024.txt", # for replication
    "mao_token_nov_2024.txt", # for replication
    "deng_tokens_nov_2024.txt" # for replication
  )
))












